In [15]:
import gzip
import tarfile
import numpy as np
import pandas as pd
import h5py as h5
import os
import os.path as osp
import glob
from sklearn import preprocessing
import math
import time
from scipy.spatial.distance import cosine
from itertools import combinations
In [44]:
data_path = os.path.join('MillionSongSubset', 'AdditionalFiles', 'subset_msd_summary_file.h5')
features = ['duration', 'end_of_fade_in','key', 'loudness', 'mode', 'start_of_fade_out', 'tempo', 'time_signature']
verbose = False
In [35]:
def get_feature_matrix(feature, data):
'''
Reads the data and the feature names and returns the track ids and the feature matrix.
The track_id field from the data is mandatory, therefore it will always be included
Args:
feature_names(list of strings): list containing the feature names that will be included
in the feature matrix.
data(pandas.io.pytables.HDFStore table): table containing the data.
Returns:
(numpy.ndarray, numpy.ndarray): (N, 1) of track_ids, feature matrix (N, D).
'''
if 'track_id' in feature:
songs = np.asarray(data[osp.join('analysis','songs')][feature])
else:
songs = np.asarray(data[osp.join('analysis','songs')][['track_id'] + feature])
return np.array(songs[:, 0]), np.array(songs[:, 1:], dtype=np.float64)
def get_random_vector(n):
'''
Returns a vector with normal distributed values {-1,1}.
Args:
n (int) : size of the vector.
Returns:
ndarray : list of length n of normal distributed values {-1,1}.
'''
return 2*np.random.randint(0, 2, n) - 1
def cosine_angle(a, b):
'''
Returns the cosine of the angle of two given vectors
Args:
a(numpy.ndarray): vector of real values.
b(numpy.ndarray): vector of real values.
Returns:
double: the cosine of the angle between a and b.
'''
return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
def cosine_distance(a, b):
'''
Returns the cosine distance between two vectors.
Args:
a(numpy.ndarray): vector of real values.
b(numpy.ndarray): vector of real values.
Returns:
double: the cosine distance of a and b.
'''
return 1.0 - cosine_angle(a, b)
def get_hash_bands(sketch, r, b):
'''
Computes the signature matrix (#samples, #bands) with hash-values for the different song based on the respective bands.
Args:
sketch(numpy.ndarray): sketch matrix (#samples, #rows*#bands) with values in domain {0,1}.
r(int): number of rows
b(int): number of bands
Returns:
(numpy.ndarray, numpy.ndarray): (#samples, #bands) with hash-values for the different song based on the respective bands.
'''
twos = np.array([1<<i for i in range(r)])
hashes = np.zeros((np.size(sketch, 0), b))
for i in range(b):
hashes[:,i] = np.dot(sketch[:, (r*i):(r*(i+1))], twos)
return hashes.astype(np.uint64)
In [45]:
'''
Operations:
- Get data
- Build feature matrix
- 0-1 normalize it
track_ids: matrix(#samples x 1)
feature_matrix: matrix(#samples x #features)
'''
songs = pd.HDFStore(data_path)
track_ids, feature_matrix = get_feature_matrix(features, songs)
feature_matrix = preprocessing.scale(feature_matrix)
if verbose:
print("Shape track_ids:",track_ids.shape)
print("Shape feature matrix:",feature_matrix.shape)
In [38]:
# data and algorithm parameters
'''
D = number of features
N = number of samples
b = number of bands
r = number of rows
eps = angle threshold(degrees)
'''
D = np.size(feature_matrix,1)
N = np.size(feature_matrix, 0)
b = 3
r = 64
eps = 2
In [46]:
'''
Operations:
- Generate matrix of random vectors with values in {-1,1}.
RV: matrix(#bands*#rows x n_features)
'''
RV = np.array([get_random_vector(D) for i in range(b*r)])
if verbose:
print("Shape RV:",np.shape(RV))
print("Random vectors matrix RV:\n",RV)
In [47]:
'''
Operations:
- Generate sketch matrix, by performing
Clip sketch matrix to 0-1 range for hashing.
Dimensionality: n_samples x n_bands*n_rows
'''
Sketch = np.dot(feature_matrix, RV.T)
Sketch[Sketch < 0] = -1
Sketch[Sketch > 0] = 1
Sketch[Sketch == 0] = np.random.randint(0,2)*2-1
if verbose:
print("Shape Sketch:",Sketch.shape)
print("Sketch:\n",Sketch)
In [48]:
# clip values of Sketch matrix in domain {0,1} to easily hash them.
Sketch[Sketch < 0] = 0
if verbose:
print("Shape Binary Sketch:",Sketch.shape)
print("Binary Sketch:\n",Sketch)
In [49]:
hb = get_hash_bands(Sketch,r, b)
if verbose:
print("Shape hb:",hb.shape)
print("hb:\n",hb)
In [43]:
'''
candidates(dict): Dictionary with key=(song_id,song_id), value=cosine_distance(song_id,song_id)
duplicates(list): List of tuples (songid, songid)
buckets(dict) : Dictionary with key=band_id, value=dict with key=hash_key, value = list of song_id
'''
candidates = {}
duplicates = []
buckets = { i : {} for i in range(b) }
start = time.time()
for i in range(b):
for j in range(N):
hash_key = hb[j,i]
if hash_key not in buckets[i]:
buckets[i][hash_key] = []
buckets[i][hash_key].append(j)
for candidates_list in buckets[i].values():
if len(candidates_list) > 1:
for _i in range(len(candidates_list)):
for _j in range(_i+1,len(candidates_list)):
songA = candidates_list[_i]
songB = candidates_list[_j]
if (songA,songB) not in candidates:
candidates[(songA,songB)] = cosine_distance(feature_matrix[songA,:],feature_matrix[songB,:])
cos_eps_dist = 1-math.cos(math.radians(eps))
for key in candidates.keys():
if candidates[key] < cos_eps_dist:
songA = key[0]
songB = key[1]
duplicates.append((songA,songB))
print("LSH Duration:", time.time() - start,"sec")
print("Nr. candidates:", len(candidates.keys()))
print("Nr. duplicates:",len(duplicates))